%%HTML
<script src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.6/require.min.js"></script>
import pandas as pd
# custom library
from TextPreProcessor import TextInputProcessor
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
import re
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import os
import numpy as np
import matplotlib.pyplot as plt
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode(connected=True)
import plotly.io as pio
pio.renderers.default = "notebook+pdf"
def cleanText(text):
    """Lowercase *text* and collapse every run of whitespace into one space."""
    # str.split() with no argument splits on any whitespace run and drops
    # leading/trailing pieces, so join+split both collapses and trims.
    return " ".join(text.lower().split())
# NOTE: the initial single-file read (pd.read_csv("spamham.csv")) was removed —
# `df` is rebuilt from all three CSVs by mergeDfs() just below, so reading one
# file here was dead work.
# Numeric class labels -> human-readable names.
MAPPING = { 1 : "spam", 0: "ham"}
def mergeDfs(datasetPaths):
df = pd.read_csv(datasetPaths[0])
for i in range(1, len(datasetPaths)):
tmp = pd.read_csv(datasetPaths[i])
tmp = tmp.drop("Unnamed: 0", axis = 1)
df = pd.concat([df, tmp])
return df
# Build the combined corpus from all three spam/ham CSV exports.
df = mergeDfs(['spamham.csv', 'spam_ham_dataset.csv', 'spamHamData.csv'])
df.head()
| text | spam | label | |
|---|---|---|---|
| 0 | Subject: naturally irresistible your corporate... | 1 | NaN |
| 1 | Subject: the stock trading gunslinger fanny i... | 1 | NaN |
| 2 | Subject: unbelievable new homes made easy im ... | 1 | NaN |
| 3 | Subject: 4 color printing special request add... | 1 | NaN |
| 4 | Subject: do not have money , get software cds ... | 1 | NaN |
# Strip the leading "Subject:" prefix (assumes every message starts with it —
# rows that don't would lose their first 8 characters; TODO confirm), then
# lowercase and collapse whitespace via cleanText.
df["text"] = df["text"].apply(lambda x: cleanText(x[len("subject:"): ] ) )
# Count missing values per column — the merged files have mismatched columns.
df.isna().sum()
text 0 spam 0 label 5728 dtype: int64
# "label" exists in only one of the merged files (NaN everywhere else) — drop it.
df = df.drop("label", axis = 1)
df.head()
| text | spam | |
|---|---|---|
| 0 | naturally irresistible your corporate identity... | 1 |
| 1 | the stock trading gunslinger fanny is merrill ... | 1 |
| 2 | unbelievable new homes made easy im wanting to... | 1 |
| 3 | 4 color printing special request additional in... | 1 |
| 4 | do not have money , get software cds from here... | 1 |
df.groupby("spam").describe()
| text | ||||
|---|---|---|---|---|
| count | unique | top | freq | |
| spam | ||||
| 0 | 10583 | 10128 | calpine daily gas nomination > ricky a . arche... | 20 |
| 1 | 3368 | 3326 | 16 | |
# Bar chart of class sizes. Deriving the x labels from the same groupby result
# guarantees labels and counts stay aligned — the old code relied on set()
# iteration order happening to match groupby's sorted key order.
class_counts = df.groupby("spam").count().text
fig = go.Figure(
    go.Bar(x = [MAPPING[k] for k in class_counts.index], y = class_counts.tolist())
)
fig.update_layout(
    title = "Distribution of Classes",
    yaxis_title = "Count",
    xaxis_title = "Class"
)
fig.show()
# Drop unusually long emails: keep only messages shorter than 10000 characters.
df = df[df["text"].apply(lambda x: len(x) < 10000)]
df.head()
| text | spam | |
|---|---|---|
| 0 | naturally irresistible your corporate identity... | 1 |
| 1 | the stock trading gunslinger fanny is merrill ... | 1 |
| 2 | unbelievable new homes made easy im wanting to... | 1 |
| 3 | 4 color printing special request additional in... | 1 |
| 4 | do not have money , get software cds from here... | 1 |
Here, the dataset is imbalanced, so if we later use a neural network, training may end up in a poor local minimum that favours the majority class.
# Plot the distribution of message lengths for spam vs ham (overlaid histograms).
# Series.str.len() computes all lengths vectorised in one call — the previous
# iterrows() version looped over every row at Python speed.
hist1 = go.Histogram(x = df.loc[df.spam == 1, "text"].str.len(),
                     nbinsx=8, name = "Spam Length")
hist2 = go.Histogram(x = df.loc[df.spam == 0, "text"].str.len(),
                     nbinsx=8, name = "Ham Length")
fig = go.Figure(data = [hist1, hist2])
fig.update_layout(
    title = "Histogram of message length of Spam vs Non spam emails",
    xaxis_title = "Message length",
    yaxis_title = "Count of Range",
    legend_title="Type of Email",
)
fig.show()
Split the data with a 0.3 test ratio (70% train / 30% test). Three steps follow: vectorize the text, split the data, and fit the models.
# Exploratory TF-IDF featurisation of the corpus (the pipelines further down
# refit their own vectorizer; this one is only for inspection).
tfidf_vec = TfidfVectorizer(smooth_idf=True,use_idf=True)
textFeatures = tfidf_vec.fit_transform(df["text"])
# NOTE: .toarray() densifies the full sparse TF-IDF matrix — fine for a quick
# look, but memory-heavy for large vocabularies.
tfidf_df = pd.DataFrame(textFeatures.toarray(), columns = tfidf_vec.get_feature_names_out())
tfidf_df[4:10].T
| 4 | 5 | 6 | 7 | 8 | 9 | |
|---|---|---|---|---|---|---|
| 00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 |
| 000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.071134 | 0.0 |
| 0000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 |
| 000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 |
| 00000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... |
| zymg | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 |
| zzmacmac | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 |
| zzn | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 |
| zzncacst | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 |
| zzzz | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 |
35406 rows × 6 columns
tfidf_vec.get_feature_names_out()
array(['00', '000', '0000', ..., 'zzn', 'zzncacst', 'zzzz'], dtype=object)
Use a split of 70% train and 30% test
# Hold out 30% of the rows for testing; random_state pins the shuffle for
# reproducibility.
X_train, X_test, y_train, y_test = train_test_split(df.text, df.spam, test_size=0.3, random_state=111)
print(X_train)
print(f"Training set : {X_train.shape}")
2477 times 2 filing units pat : recently , i talked...
3363 kwbt bio - tech signs letter of intent , gcm a...
3599 cialis , xanax , valium , viagra at low price ...
213 how are you today ? orchard m . guei . republi...
4246 global risk management operations recognizing ...
...
1760 dave n out until july 5 th i will be taking a ...
4219 re : recruiting at cmu computational finance p...
4858 re : introduction i would be very happy to par...
4522 from 17 because paliourg , # valiumxanaxcialis...
1308 -list-admin@freshrpms.net wed oct 2 11:45:08 2...
Name: text, Length: 9657, dtype: object
Training set : (9657,)
print(y_train)
2477 0
3363 1
3599 1
213 1
4246 0
..
1760 0
4219 0
4858 0
4522 1
1308 0
Name: spam, Length: 9657, dtype: int64
# End-to-end model: custom text clean-up -> TF-IDF -> sigmoid-kernel SVM.
spamPipe = Pipeline([
('text_preProcess', TextInputProcessor()),
('tfidf', TfidfVectorizer()),
('SVM', SVC(kernel='sigmoid', gamma=1.0))
])
print(spamPipe)
Pipeline(steps=[('text_preProcess',
<TextPreProcessor.TextInputProcessor object at 0x000001FF6A15AD90>),
('tfidf', TfidfVectorizer()),
('SVM', SVC(gamma=1.0, kernel='sigmoid'))])
spamPipe.fit(X_train, y_train)
Pipeline(steps=[('text_preProcess',
<TextPreProcessor.TextInputProcessor object at 0x000001FF6A15AD90>),
('tfidf', TfidfVectorizer()),
('SVM', SVC(gamma=1.0, kernel='sigmoid'))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('text_preProcess',
<TextPreProcessor.TextInputProcessor object at 0x000001FF6A15AD90>),
('tfidf', TfidfVectorizer()),
('SVM', SVC(gamma=1.0, kernel='sigmoid'))])<TextPreProcessor.TextInputProcessor object at 0x000001FF6A15AD90>
TfidfVectorizer()
SVC(gamma=1.0, kernel='sigmoid')
# model metrics
def showMetrics(testData, truths, model):
    """Print the confusion matrix and classification report for *model* on
    *testData*, then draw the confusion matrix as an annotated heatmap."""
    predictions = model.predict(testData)
    matrix = confusion_matrix(truths, predictions)
    print("Confusion Matrix")
    print(matrix)
    print("Classification report")
    # Class 0 is ham, class 1 is spam (see MAPPING above).
    report = classification_report(truths, predictions, target_names = ["ham", "spam"])
    print(report)
    heat = sns.heatmap(matrix, annot=True, fmt='d')
    heat.set(title = "HeatMap of Predictions", xlabel="Predicted", ylabel="Truth")
    plt.show()
showMetrics(X_test, y_test, spamPipe)
Confusion Matrix
[[3111 26]
[ 17 985]]
Classification report
precision recall f1-score support
ham 0.99 0.99 0.99 3137
spam 0.97 0.98 0.98 1002
accuracy 0.99 4139
macro avg 0.98 0.99 0.99 4139
weighted avg 0.99 0.99 0.99 4139
# sample function in sklearn to print learning curves
from sklearn.model_selection import learning_curve
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
"""
Generate a simple plot of the test and training learning curve.
Parameters
----------
estimator : object type that implements the "fit" and "predict" methods
An object of that type which is cloned for each validation.
title : string
Title for the chart.
X : array-like, shape (n_samples, n_features)
Training vector, where n_samples is the number of samples and
n_features is the number of features.
y : array-like, shape (n_samples) or (n_samples, n_features), optional
Target relative to X for classification or regression;
None for unsupervised learning.
ylim : tuple, shape (ymin, ymax), optional
Defines minimum and maximum yvalues plotted.
cv : integer, cross-validation generator, optional
If an integer is passed, it is the number of folds (defaults to 3).
Specific cross-validation objects can be passed, see the
sklearn.model_selection module for the list of possible objects
n_jobs : integer, optional
Number of jobs to run in parallel (default 1).
"""
plt.figure()
plt.title(title)
if ylim is not None:
plt.ylim(*ylim)
plt.xlabel("Training examples")
plt.ylabel("Score")
# Fit the estimator at several training-set sizes and score with cross-validation.
train_sizes, train_scores, test_scores = learning_curve(
estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
plt.grid()
# Shaded bands show +/- one standard deviation across the CV folds.
plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
train_scores_mean + train_scores_std, alpha=0.1,
color="r")
plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
test_scores_mean + test_scores_std, alpha=0.1, color="g")
plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
label="Training score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
label="Cross-validation score")
plt.legend(loc="best")
# Returns the pyplot module itself so callers can keep customising the figure.
return plt
plot_learning_curve(spamPipe, "Learning Curves (SVM, Sigmoid Kernel, $\gamma=1$)", X_train, y_train)
<module 'matplotlib.pyplot' from 'd:\\Python\\Python39\\lib\\site-packages\\matplotlib\\pyplot.py'>
# test on random spam mail from inbox
# Four real emails copied verbatim: two scams (expected spam = 1) and two
# legitimate class/webinar notifications (expected ham = 0).
examples = ["""
I am a senior official from the World Health Organization(WHO), I was instructed to contact you regarding on-going Compensation Grant Awards for Covid-19, approved by the World Health Organization alongside the United Nations as compensation payment/rewards for eligible beneficiaries.
Have you received your Grant Award payment of US$500,000 for COVID-19? If NO! Kindly be informed that you are among the lucky winners randomly selected to benefit from this relief program.
Contact the info below to reach Grant Award Officer-""",
"""
I am Charles Rettig, US Commissioner of Internal Revenue Services.
This is to inform you about the release of your overdue benefits fund
of $3,900,000.00 USD which was on hold for a long time. Your
consignment box of $3.9 Million United States Dollars has been
approved in your name by all the United States Federal enforcement law
for funds and is ready to deliver to you as soon as you reconfirm your
mailing address.
As this matter is urgent, I look forward to hearing from you as soon
as possible.
Please reply with your needed information such as
Full name..........
Address..........
Phone number .........
Country........
Occupation......
Regards. Charles Rettig.""", """Hi
Class "Post Analysis: BST, Heaps and Map" has been scheduled on 14 Oct 2022 07:00PM
Duration : 120 Mins
Mentor : jaydals0eo8""",
"""Link: https://attendee.gotowebinar.com/register/2135259703821289230
Welcome, All! In this webinar, we will discuss the following:
* Cloud computing market will be worth $800 billion by 2025
* 30,000+ Job openings for freshers in AWS
* Career opportunities in AWS Cloud Computing
* Avg. salary of Rs. 4.5 LPA
* Continuous placement support
* Companies hiring 2022, 2021, 2020, and 2019 batch students
* Why AWS is the hot domain in MNCs
When
Friday 21 Oct 2022 ⋅ 6pm – 7pm (India Standard Time - Kolkata)
Location
https://attendee.gotowebinar.com/register/2135259703821289230
View map
Organiser
yasmin.taj@ethnus.com"""]
# Predict directly on the raw strings — the pipeline handles clean-up + TF-IDF.
pred = spamPipe.predict(examples)
print(f"Predictions = {pred}")
Predictions = [1 1 0 0]
# saving model
import joblib
# Persist the fitted pipeline (preprocessor + vectorizer + SVM) for later reuse.
with open('spam_model.pkl', 'wb') as f:
joblib.dump(spamPipe, f)
# Second model: same preprocessing + TF-IDF, random-forest classifier.
# NOTE(review): the final step is still named 'SVM' although it holds a
# RandomForestClassifier — consider renaming for clarity.
spamPipe1 = Pipeline([
('text_preProcess', TextInputProcessor()),
('tfidf', TfidfVectorizer()),
('SVM', RandomForestClassifier(n_estimators=15, random_state = 11))
])
print(spamPipe1)
spamPipe1.fit(X_train, y_train)
Pipeline(steps=[('text_preProcess',
<TextPreProcessor.TextInputProcessor object at 0x000001FF6BC04EE0>),
('tfidf', TfidfVectorizer()),
('SVM',
RandomForestClassifier(n_estimators=15, random_state=11))])
Pipeline(steps=[('text_preProcess',
<TextPreProcessor.TextInputProcessor object at 0x000001FF6BC04EE0>),
('tfidf', TfidfVectorizer()),
('SVM',
RandomForestClassifier(n_estimators=15, random_state=11))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('text_preProcess',
<TextPreProcessor.TextInputProcessor object at 0x000001FF6BC04EE0>),
('tfidf', TfidfVectorizer()),
('SVM',
RandomForestClassifier(n_estimators=15, random_state=11))])<TextPreProcessor.TextInputProcessor object at 0x000001FF6BC04EE0>
TfidfVectorizer()
RandomForestClassifier(n_estimators=15, random_state=11)
showMetrics(X_test, y_test, spamPipe1)
Confusion Matrix
[[3105 32]
[ 128 874]]
Classification report
precision recall f1-score support
ham 0.96 0.99 0.97 3137
spam 0.96 0.87 0.92 1002
accuracy 0.96 4139
macro avg 0.96 0.93 0.95 4139
weighted avg 0.96 0.96 0.96 4139
# Third model: same preprocessing + TF-IDF, k-nearest-neighbours classifier.
# NOTE(review): the step name 'SVM' is again misleading (it holds a KNN model).
spamPipe2 = Pipeline([
('text_preProcess', TextInputProcessor()),
('tfidf', TfidfVectorizer()),
('SVM', KNeighborsClassifier(n_neighbors=50))
])
print(spamPipe2)
spamPipe2.fit(X_train, y_train)
Pipeline(steps=[('text_preProcess',
<TextPreProcessor.TextInputProcessor object at 0x000001FF6BC04A90>),
('tfidf', TfidfVectorizer()),
('SVM', KNeighborsClassifier(n_neighbors=50))])
Pipeline(steps=[('text_preProcess',
<TextPreProcessor.TextInputProcessor object at 0x000001FF6BC04A90>),
('tfidf', TfidfVectorizer()),
('SVM', KNeighborsClassifier(n_neighbors=50))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('text_preProcess',
<TextPreProcessor.TextInputProcessor object at 0x000001FF6BC04A90>),
('tfidf', TfidfVectorizer()),
('SVM', KNeighborsClassifier(n_neighbors=50))])<TextPreProcessor.TextInputProcessor object at 0x000001FF6BC04A90>
TfidfVectorizer()
KNeighborsClassifier(n_neighbors=50)
showMetrics(X_test, y_test, spamPipe2)
Confusion Matrix
[[3051 86]
[ 60 942]]
Classification report
precision recall f1-score support
ham 0.98 0.97 0.98 3137
spam 0.92 0.94 0.93 1002
accuracy 0.96 4139
macro avg 0.95 0.96 0.95 4139
weighted avg 0.97 0.96 0.96 4139